{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "--2019-04-15 19:22:45--  http://cs.stanford.edu/~danqi/data/cnn.tar.gz\n",
      "Resolving cs.stanford.edu (cs.stanford.edu)... 171.64.64.64\n",
      "Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:80... connected.\n",
      "HTTP request sent, awaiting response... 301 Moved Permanently\n",
      "Location: https://cs.stanford.edu/~danqi/data/cnn.tar.gz [following]\n",
      "--2019-04-15 19:22:45--  https://cs.stanford.edu/~danqi/data/cnn.tar.gz\n",
      "Connecting to cs.stanford.edu (cs.stanford.edu)|171.64.64.64|:443... connected.\n",
      "HTTP request sent, awaiting response... 200 OK\n",
      "Length: 571560380 (545M) [application/x-gzip]\n",
      "Saving to: ‘cnn.tar.gz’\n",
      "\n",
      "cnn.tar.gz          100%[===================>] 545.08M  11.9MB/s    in 49s     \n",
      "\n",
      "2019-04-15 19:23:35 (11.1 MB/s) - ‘cnn.tar.gz’ saved [571560380/571560380]\n",
      "\n",
      "cnn/\n",
      "cnn/dev.txt\n",
      "cnn/train.txt\n",
      "cnn/test.txt\n"
     ]
    }
   ],
   "source": [
    "# Fetch the CNN archive (about 545 MB) and unpack it into ./cnn/.\n",
    "# -nc (--no-clobber) skips the download when cnn.tar.gz already exists, so re-running\n",
    "# this cell does not fetch a second copy that wget would save as cnn.tar.gz.1.\n",
    "# The https URL is used directly because the http one only answers with a 301 redirect.\n",
    "!wget -nc https://cs.stanford.edu/~danqi/data/cnn.tar.gz\n",
    "!tar -xvzf cnn.tar.gz"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Names of the dataset splits handled by the cells below.\n",
    "keys = [\"train\", \"dev\", \"test\"]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# p, q, a map each split name to parallel lists of passages, questions and answers.\n",
    "# Records in cnn/<split>.txt are separated by blank lines; within a record,\n",
    "# line 0 is the question, line 1 the answer and line 2 the passage.\n",
    "p, q, a = {}, {}, {}\n",
    "for k in keys:\n",
    "    # The context manager closes each file promptly instead of leaking the handle;\n",
    "    # the explicit encoding avoids depending on the platform's default.\n",
    "    with open('cnn/' + k + '.txt', encoding='utf-8') as split_file:\n",
    "        records = split_file.read().strip().split('\\n\\n')\n",
    "    records = [x.split('\\n') for x in records]\n",
    "    p[k] = [x[2] for x in records]\n",
    "    q[k] = [x[0] for x in records]\n",
    "    a[k] = [x[1] for x in records]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# For every split, collect the distinct whitespace-separated passage tokens that\n",
    "# start with '@entity'.\n",
    "entities = {}\n",
    "for k in p:\n",
    "    entities[k] = {y for x in p[k] for y in x.split() if y.startswith('@entity')}\n",
    "\n",
    "# Write the training-split entities one per line. Sorting makes the file identical\n",
    "# across runs: the iteration order of a set of strings changes with Python's\n",
    "# per-process hash randomization. The with-block closes the file even if the write fails.\n",
    "with open('entity_list.txt', 'w') as f:\n",
    "    f.write('\\n'.join(sorted(entities['train'])))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_possible_answers(p):\n",
    "    \"\"\"Return the distinct '@entity' tokens of a passage as one ';'-separated string.\n",
    "\n",
    "    p: passage text; it is split on whitespace.\n",
    "    Returns the unique tokens starting with '@entity', sorted and joined by ';'\n",
    "    (an empty string when there are none). Sorting keeps the result reproducible:\n",
    "    iterating a set of strings directly gives an order that varies between\n",
    "    Python processes.\n",
    "    \"\"\"\n",
    "    possible_answers = {w for w in p.split() if w.startswith('@entity')}\n",
    "    return \";\".join(sorted(possible_answers))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Flatten the per-split lists into single columns, walking the splits in `keys` order\n",
    "# so that row i of every column refers to the same example.\n",
    "df_paragraphs = [passage for split in keys for passage in p[split]]\n",
    "df_questions = [question for split in keys for question in q[split]]\n",
    "df_answers = [answer for split in keys for answer in a[split]]\n",
    "df_possible_answers = [generate_possible_answers(passage) for passage in df_paragraphs]\n",
    "# One split label per example, repeated once for each passage of that split.\n",
    "df_exp_splits = [split for split in keys for _passage in p[split]]\n",
    "\n",
    "df = pd.DataFrame({\n",
    "    'paragraph': df_paragraphs,\n",
    "    'question': df_questions,\n",
    "    'answer': df_answers,\n",
    "    'exp_split': df_exp_splits,\n",
    "    'possible_answers': df_possible_answers,\n",
    "})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the combined table; index=False leaves the row index out of the file.\n",
    "df.to_csv(\n",
    "    'cnn_dataset.csv',\n",
    "    index=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size :  70187\n",
      "Adding Answers to vocab too . Should only be occuring for CNN dataset\n",
      "Found 35237 words in model out of 70190\n"
     ]
    }
   ],
   "source": [
    "# Runs ../preprocess_data_QA.py on cnn_dataset.csv, passing entity_list.txt as the\n",
    "# --all_answers_file and ./vec_cnn.p as the --output_file.\n",
    "# NOTE(review): what --min_df 8 and --add_answers_to_vocab do is defined in that\n",
    "# script, not in this notebook - confirm there before changing them.\n",
    "%run \"../preprocess_data_QA.py\" --data_file cnn_dataset.csv --output_file ./vec_cnn.p --all_answers_file entity_list.txt \\\n",
    "--word_vectors_type fasttext.simple.300d --min_df 8 --add_answers_to_vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
